install_github("link")
library("anydist")Unbiased mixed variables distances
supplementary material
Here we provide the code to reproduce the results of the supplementary material of the paper “Unbiased mixed variables distances”.
Code
CongruenceCoeff<-function(L1,L2){
L1<-dist(L1)
L2<-dist(L2)
CC<-sum(diag(t(L1) %*% L2)) / sqrt(sum(diag(t(L1) %*% L1))*sum(diag(t(L2) %*% L2)))
return(CC)
}Variable specific effects
Data generation
Code
reps<-100 # 100 can take 30min
#reps<-5 # 100 can take 30min
n<-500
k<-2 # Dimension solution
sigma<-0.03 # Noise
porig<-6 # Number of variables corresponding to true config
pnnoise<-0 # Number of numerical noise vars
pcnoise<-0 # Number of categorical noise vars
pn<-2 # Number of numerical variables underying config
pnum<-pn+pnnoise+pcnoise # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
p<-porig+pnnoise+pcnoise # Total number of variables
qoptions<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
pcat<-length(qoptions) # Number of cat variables underlying config
# We consider some variants:
presets<-c("custom","HLa","HLe","CatDissim","gower","unbiased_inter-dependent",
"euclidean_onehot", "unbiased_independent")
mad_importances<-cc_importances<-array(NA,dim=(c(reps,length(presets),p)))
set.seed(1234)
t0<-Sys.time()
for (rep in 1:reps){
# Generate 2d data:
T<-cbind(runif(n,-2,2),runif(n,-2,2)) # Random Uniform
SVDT <- svd(T)
Y<-SVDT$u # TU is orthonormal. This is the underlying 2d config
R<-NULL
for (i in 1:porig){
R<-cbind(R,runif(2,-2,2)) # or uniform. Seems to have little impact
}
X<-Y%*%R # Inflate/expand the data
Xorig<-as.data.frame(X)
X<-as.data.frame(Xorig)
# Add noise (we do this before making categorical; could also be after)
for (i in 1:porig){
X[,i]<-X[,i]+rnorm(n,0,sigma)
}
### Or: Add noise columns. Completely random numerical variables
N<-NULL
if (pnnoise>0){
for (i in 1:pnnoise){
N<-cbind(N,rnorm(n))
}
}
if (pcnoise>0){
qr[rep]<-round(runif(1,3,9))
for (i in 1:pcnoise){
Cj<-as.factor(round(runif(n,1.5,qr[rep]+0.5)))
if (is.null(N)){
N<-cbind(N,Cj)
N<-as.factor(N)
} else {
N<-cbind.data.frame(N,Cj)}
}
colnames(N)<-(1:(pnnoise+pcnoise))
}
if ((pnnoise+pcnoise)>0) {
Xorig<-cbind.data.frame(N,Xorig) # The random variables are the first pnoise
X<-cbind.data.frame(N,X)
}
# END add noise columns
for (i in (pnum+1):p){ # Create cat variables by simply evenly cutting based on range
X[,i]<-cut(X[,i],qoptions[i-pnum])
}
for (pr in 1:length(presets)){
# For each preset, first create full distance matrix:
if (pr==1){ # This is the real, numerical data: We use manhattan on scaled values:
Dall<-mdist(Xorig,preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, cont_scaling="std")
} else if (pr==2){ # This is an HL variant on the manipulated (5 original, 5 discretized) data
# Below we specify that we use manhattan, numerical is scaled, categorical uses HL scaling
Dall<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "matching", cat_scaling="HL", commensurable=FALSE, cont_scaling="std")
} else if (pr==3){ # HL Euclidean
Dall<-mdist(X,
distance_cont = "euclidean",
commensurable = FALSE,
cont_scaling = "std",
cat_scaling = "HLeucl")
} else if (pr==4){ # Catdissim
Dall<-mdist(X,preset="custom",cat_scaling = "cat_dis",commensurable = TRUE)
} else if (pr==5){ # Gower
Dall<-mdist(X,preset=presets[pr])*ncol(X)
} else { # The other presets are the 3 options
Dall<-mdist(X,preset=presets[pr])}
MDS_all <- cmdscale(Dall,eig=TRUE, k=2)
# Leave-one-out:
for (j in 1:p){
if (pr==1){
Dj<-mdist(Xorig[,-j],preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, cont_scaling="std")
} else if (pr==2){
Dj<-mdist(X[,-j],preset="custom",distance_cont = "manhattan",distance_cat = "matching", cat_scaling="HL", commensurable=FALSE, cont_scaling="std")
} else if (pr==3){ # HL 2
Dj<-mdist(X[,-j],distance_cont = "euclidean",
commensurable = FALSE,
cont_scaling = "std",
cat_scaling = "HLeucl")
} else if (pr==4){ # Unbiased catdissim
Dj<-mdist(X[,-j],preset="custom",
cat_scaling = "cat_dis",commensurable = TRUE)
} else if (pr==5){ # Gower
Dj<-mdist(X[,-j],preset=presets[pr])*(ncol(X)-1)
} else { # the remaining presets
Dj<-mdist(X[,-j],preset=presets[pr])}
mad_importances[rep,pr,j]<-mean(abs(Dall-Dj))
MDS_j <- cmdscale(Dj,eig=TRUE, k=2)
cc_j<-CongruenceCoeff(MDS_all$points[,1:2], MDS_j$points[,1:2])
cc_importances[rep,pr,j]<-cc_j
}
}
} # End replications
(elapsed<-Sys.time()-t0)
ac_importances<-sqrt((1-cc_importances^2)) # We calculated congruences
variants<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(1,7,3,2,5,8,4,6)
mad_importances<-mad_importances[,varreorder,]
ac_importances<-ac_importances[,varreorder,]
mad<-ac<-madranks<-ccranks<-maddis<-madrel<-acrel<-NULL
mad_dists<-madrels<-acrels<-array(NA,dim=c(length(presets),reps,p))
for (pr in 1:length(presets)){
mad<-rbind(mad,colMeans(mad_importances[,pr,]))
ac<- rbind(ac,colMeans(ac_importances[,pr,]))
maddis<-rbind(maddis,colMeans(mad_importances[,pr,]/rowSums(mad_importances[,pr,])))
mad_dists[pr,,]<-mad_importances[,pr,]/rowSums(mad_importances[,pr,]) # We need all distributions
madrels[pr,,]<-mad_dists[pr,,]/mad_dists[1,,] # If larger: bigger diff than true
madrel<-rbind(madrel,colMeans(madrels[pr,,]))
acrels[pr,,]<-ac_importances[,pr,]/ac_importances[,1,] #
acrel<-rbind(acrel,colMeans(acrels[pr,,]))
}
Measures<-list()
Measures[[1]]<-data.frame(t(mad))
Measures[[2]]<-data.frame(t(maddis))
Measures[[3]]<-data.frame(t(ac))
Measures[[4]]<-data.frame(t(madrel))
Measures[[5]]<-data.frame(t(acrel))
Measures_tib= tibble(meas=Measures,
meas_name=c("MAD","MAD distribution","AC","MAD rel","AC rel"))
save(Measures_tib,file="data/Measures_tib.RData")Code for Figure 1
Code
variants<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(1,7,3,2,5,8,4,6)
meas_to_plot_mad = Measures_tib |>
mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
) |>
unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>
filter(meas_name %in% c("MAD"))
meas_to_plot_mad_rel = Measures_tib |>
mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
) |>
unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>
filter(meas_name %in% c("MAD distribution"))
###################
mad_max = max(meas_to_plot_mad |> select(where(is.numeric)))
rect_data = tibble(
xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max,
xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max)
mad_max_rel = max(meas_to_plot_mad_rel |> select(where(is.numeric)))
rect_data_rel = tibble(
xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=mad_max_rel,
xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=mad_max_rel)
loo_mad_plot = meas_to_plot_mad |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |>
ggplot(aes(x = parse_number(loo_v), y = value,
color = variant_name)) +
geom_rect(data=rect_data,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line(aes(lty=variant_name)) +
ylab("absolute variable contribution to distance")+
xlab("left-out variable") + theme_bw() + theme(legend.position = "none")+
scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
loo_mad_rel_plot = meas_to_plot_mad_rel |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |>
ggplot(aes(x = parse_number(loo_v), y = value,
color = variant_name)) +
geom_rect(data=rect_data_rel,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_rel,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line(aes(lty=variant_name)) +
ylab("relative variable contribution to distance") +
xlab("left-out variable") + theme_bw() +
scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
fig_1_plot = loo_mad_plot | loo_mad_rel_plot
print(fig_1_plot, width = 8, height = 4)Code for Figure 2
Code
meas_to_plot_ac = Measures_tib |>
mutate(meas=map(.x=meas,function(x=.x){colnames(x)=variants[varreorder];return(x)}),
meas=map(.x=meas,~.x |> slice(c(3:6,1,2)))
) |>
unnest(meas) |>mutate(loo_v=paste0("var",rep(1:6,times=5))) |>
filter(meas_name %in% c("AC rel"))
ac_max = max(meas_to_plot_ac |> select(where(is.numeric)))
rect_data_ac = tibble(
xmin_cont=4.5,xmax_cont=6,ymin_cont=0,ymax_cont=ac_max,
xmin_cat=.5,xmax_cat=4.5,ymin_cat=0,ymax_cat=ac_max)
ac_plot = meas_to_plot_ac |> pivot_longer(cols=Num:Udep, names_to="variant_name",values_to="value") |>
ggplot(aes(x = parse_number(loo_v), y = value,
color = variant_name)) +
geom_rect(data=rect_data_ac,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_ac,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line(aes(lty=variant_name)) +
ylab("alienation coefficient")+
xlab("left-out variable") + theme_bw() +
scale_x_continuous(breaks=seq(1,6,1),labels = c("2 cats", "3 cats", "5 cats", "9 cats", "num 1", "num 2"))+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
print(ac_plot, width = 8, height = 4)Retrieval of the true configuration
Data generation
Code
reps <- 100
n<-500 # large is needed for the many (9) categories scenario
k<-2
sigma<-0.03 # standard deviation of Xorig variables is
# is around 0.067
porig<-6 # Number of variables corresponding to true config
pnnoise<-0 # Number of numerical noise vars
pcnoise<-0 # Number of categorical noise vars
pn<-3 # Number of numerical variables underying config
pnum<-pn+pnnoise+pcnoise # Total number of numerical vars+noise vars (Needed to know which vars to discretize)
p<-porig+pnnoise+pcnoise # Total number of variables
qoptions<-c(2,3,5,9) # Distribution of the categorical variables corresponding to true config
pcat<-length(qoptions) # Number of cat variables underlying config
# We consider some variants:
presets<-c("custom","HL","HLEuclid","Unbiased_CatDissim", "gower","unbiased_inter-dependent", "euclidean_onehot","unbiased_independent")#,"gower2","cat dissim")
CC<-array(NA,dim=c(reps,length(presets),length(qoptions)))
qr<-rep(NA,reps)
set.seed(123)
t0<-Sys.time()
for (rep in 1:reps){
# Generate 2d data:
T<-cbind(runif(n,-2,2),runif(n,-2,2)) # Random Uniform
SVDT <- svd(T)
Y<-SVDT$u # TU is orthonormal. This is the underlying 2d config
R<-NULL
for (i in 1:porig){
R<-cbind(R,runif(2,-2,2)) # or uniform. Seems to have little impact
}
X<-Y%*%R # Inflate/expand the data
Xorig<-as.data.frame(X)
X<-as.data.frame(Xorig)
# Add noise (we do this before making categorical; could also be after)
for (i in 1:porig){
X[,i]<-X[,i]+rnorm(n,0,sigma)
}
### Or: Add noise columns. Completely random numerical variables
N<-NULL
if (pnnoise>0){
for (i in 1:pnnoise){
N<-cbind(N,rnorm(n))
}
}
if (pcnoise>0){
qr[rep]<-round(runif(1,3,9))
for (i in 1:pcnoise){
Cj<-as.factor(round(runif(n,1.5,qr[rep]+0.5)))
if (is.null(N)){
N<-cbind(N,Cj)
N<-as.factor(N)
} else {
N<-cbind.data.frame(N,Cj)}
}
colnames(N)<-(1:(pnnoise+pcnoise))
}
if ((pnnoise+pcnoise)>0) {
Xorig<-cbind.data.frame(N,Xorig) # The random variables are the first pnoise
X<-cbind.data.frame(N,X)
}
# END add noise columns
for (q in 1:length(qoptions)){
qs<-rep(qoptions[q],p-pnum) # Choice of categories.
# If number to large, we could get
# empty categories. All cat variables
# have same number of categories
pcat<-length(qs)
for (i in (pnum+1):p){ # Create cat variables by simply evenly cutting based on range
X[,i]<-cut(Xorig[,i],qs[i-pnum])
}
for (pr in 1:length(presets)){
# For each preset, first create full distance matrix:
if (pr==1){ # This is the real, numerical data: We use manhattan on scaled values:
Dall<-mdist(Xorig,preset=presets[pr],distance_cont = "manhattan",commensurable=FALSE, cont_scaling="std")
} else if (pr==2){ # Additive HL, numerical is scaled, categorical uses HL scaling
Dall<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "matching", cat_scaling="HL", commensurable=FALSE, cont_scaling="std")
} else if (pr==3){ # HL Euclid
Dall<-mdist(X,
distance_cont = "euclidean",
commensurable = FALSE,
cont_scaling = "std",
cat_scaling = "HLeucl")
} else if (pr==4){ # Cat dissimilarity
Dall<-mdist(X,preset="custom",cat_scaling = "cat_dis",commensurable = TRUE)
} else if (pr==5){ # Gower
Dall<-mdist(X,preset=presets[pr])*ncol(X)
} else { # The other presets are the remaining 3 options
if (pr==7){ # Catch error if no contvars
if ((pn+pnnoise)==0) {
print("Naive distances error: No numvars")
X1<-as.numeric(X[,1])
X2<-cbind.data.frame(X1,X[,-1])
Dall<-mdist(X2,preset=presets[pr])
} else { #pn+pnnoise !=0, pr==7
Dall<-mdist(X,preset=presets[pr])
}
} else {# pr is not 7
#print(pr)
Dall<-mdist(X,preset=presets[pr])}
}
MDS_all <- cmdscale(Dall,eig=TRUE, k=2)
CC[rep,pr,q]<-CongruenceCoeff(Y,MDS_all$points[,1:2])
} # Mix dist variants
} # Number of categories
} # End replications
save(CC,file="data/boxplot_CC_data.RData")Code for figure Figure 3
Code
reps <- 100
qoptions<-c(2,3,5,9)
AC<-sqrt((1-CC^2)) # We calculated congruences
MeasNames<-c("MAD","MAD Distr","Alienation Coeff", "MAD ranks", "CC ranks", "Relative MADs", "Relative ACs")
variants<-c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(1,7,3,2,5,8,4,6)
str(AC)
## num [1:100, 1:8, 1:4] 0.227 0.166 0.187 0.167 0.167 ...
diff_AC<-array(NA,c(reps,7,4))
for(q in 1:length(qoptions)){
numeric_AC= matrix(rep(AC[,1,q],7),nrow=reps)
diff_AC[,,q]<-AC[,-1,q]-numeric_AC
}
AC_tib = rbind(as_tibble(data.frame(AC[,,1])) |> mutate(categories="two cats"),
as_tibble(data.frame(AC[,,2])) |> mutate(categories="three cats"),
as_tibble(data.frame(AC[,,3])) |> mutate(categories="five cats"),
as_tibble(data.frame(AC[,,4])) |> mutate(categories="nine cats")) |>
mutate(categories=fct_inorder(categories))
names(AC_tib) = c("Num","HLa","HL", "Ustd", "G","Udep","Naive","Uind","categories")
variant_ordering = c(1,7,3,2,5,8,4,6)
AC_tib = AC_tib |> select(variant_ordering,categories)
AC_tib_long = AC_tib |> pivot_longer(
cols=c("Num","Naive","HL", "HLa","G","Uind","Ustd","Udep"),
names_to="variants", values_to="diff_AC") |> mutate(variants=fct_inorder(variants))
gg_color_hue <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
my_colors= c("darkgrey",gg_color_hue(7)[c(4,2,3,1,5,6,7)])
boxplot_figure_simu = AC_tib_long |> ggplot(aes(x=(variants), y=diff_AC))+
geom_boxplot(aes(fill=variants)) +
facet_wrap(~categories)+ theme_bw()+
scale_fill_manual(values=my_colors) + theme(legend.position = "none")+
ylab("Alienation coefficient") + xlab("Variants")
print(boxplot_figure_simu,width=8.5,height=6)Illustration: FIFA player data
Data preparation and analysis
Code
# Variables are organized: Categorical first, then numerical
load("data/Fifa21NL.RData") # Load the data
#Xorig<-X<-ddata[,-c(1,2)]
Xorig<-X<-ddata[,-c(1,2,19:24)] # vars 1 and 2 are a bit useless (too many categories)
# vars 19:24 interesting but same scale and little variation
# leaving them out makes the plots a bit smaller.
# leaving them in doesn't seem to change a lot
n<-nrow(X)
p<-ncol(X)
K<-10 # Number of clusters in cluster analysis
nd<-2 # Dimensionality of MDS solution
X=X[,c(8:1,9:16)] # Order
X=X[,-3] # Remove int rep: 380 have something, 21+7 in other cats
X=X[,c(1:12,14,13,15)]
p<-ncol(X)
presets<-c("HL","HLEuclid","Unbiased_CatDissim", "gower","unbiased_inter-dependent", "euclidean_onehot","unbiased_independent")#,"gower2","cat dissim")
mad_importances<-cc_importances<-cc_ranks<-mad_ranks<-matrix(NA,length(presets),p)
# ASWs<-array(NA,dim=c(length(presets),p+1,K-1))
#FullClusters<-matrix(NA,n,K-1)
# ARIs<-array(NA,dim=c(length(presets),p,K-1))
t0<-Sys.time()
for (pr in 1:length(presets)){
# For each preset, first create full distance matrix:
if (pr==1){ # This is an HL variant on the manipulated (5 original, 5 discretized) data
# Below we specify that we use manhattan, numerical is scaled, categorical uses HL scaling
Dall<-mdist(X,preset="custom",distance_cont = "manhattan",distance_cat = "matching", cat_scaling="HL", commensurable=FALSE, cont_scaling="std")
} else if (pr==2){ # HL Euclidean
Dall<-mdist(X,
distance_cont = "euclidean",
commensurable = FALSE,
cont_scaling = "std",
cat_scaling = "HLeucl")
} else if (pr==3){ # Catdissim
Dall<-mdist(X,preset="custom",cat_scaling = "cat_dis",commensurable = TRUE)
} else if (pr==4){ # Gower
Dall<-mdist(X,preset=presets[pr])*ncol(X)
# Dall<-mdist(X,distance_cat = "matching",
# distance_cont = "manhattan",
# cat_scaling = "none",
# cont_scaling = "range",
# commensurable = FALSE,
# preset = "custom")
} else { # The other presets are the 3 options
Dall<-mdist(X,preset=presets[pr])}
MDS_all <- cmdscale(Dall,eig=TRUE, k=nd) # Full MDS solution
# FullClusters<-matrix(NA,n,K-1) # Full cluster solution
# for (k in 2:K){ # we do this for k in 2:K
# pam.out<-pam(Dall,k)
# ASWs[pr,1,k-1]<-pam.out$silinfo$avg.width # if we want to compare asw
# FullClusters[,k-1]<-pam.out$clustering # For each K we get a cluster solution. We compare solutions leaving one-variable out later
# }
#
# Leave-one-out: (still same pr)
for (j in 1:p){
if (pr==1){
Dj<-mdist(X[,-j],preset="custom",distance_cont = "manhattan",distance_cat = "matching", cat_scaling="HL", commensurable=FALSE, cont_scaling="std")
} else if (pr==2){ # HL 2
Dj<-mdist(X[,-j],distance_cont = "euclidean",
commensurable = FALSE,
cont_scaling = "std",
cat_scaling = "HLeucl")
} else if (pr==3){ # Unbiased catdissim
Dj<-mdist(X[,-j],preset="custom",
cat_scaling = "cat_dis",commensurable = TRUE)
} else if (pr==4){ # Gower
Dj<-mdist(X[,-j],preset=presets[pr])*(ncol(X)-1)
# Dall<-mdist(X[,-j],distance_cat = "matching",
# distance_cont = "manhattan",
# cat_scaling = "none",
# cont_scaling = "range",
# commensurable = FALSE,
# preset = "custom")
} else { # the remaining presets
Dj<-mdist(X[,-j],preset=presets[pr])}
mad_importances[pr,j]<-mean(abs(Dall-Dj)) # Compare MAD
MDS_j <- cmdscale(Dj,eig=TRUE, k=nd) # Needed to compare MDS
cc_j<-CongruenceCoeff(MDS_all$points[,1:nd], MDS_j$points[,1:nd])
cc_importances[pr,j]<-cc_j # The congruence coefficients
# for (k in 2:K){ # Now compare for each k, the obtained clusters
# pam.out<-pam(Dj,k)
# ASWs[pr,j+1,k-1]<-pam.out$silinfo$avg.width
# ARIs[pr,j,k-1]<-ARI(FullClusters[,k-1],pam.out$clustering) # Compare,
# # for each k, all variable clusters with
# # leave-one-out clusters
#
# } # This gives per pr and k, the effect (ARI) of leaving individual variables out on the clustering
}
# rankings: (Not sure needed)
cc_ranks[pr,]<-rank(cc_importances[pr,])
# mad_ranks[pr,]<-rank(mad_importances[pr,])
}
# (elapsed<-Sys.time()-t0)
variants<-c("HLa","HL", "Ustd", "G","Udep","Naive","Uind")
varreorder<-c(6,2,1,4,7,3,5) # To get better sequence
mad_importances<-t(mad_importances)
cc_importances<-sqrt((1-cc_importances^2)) # Change to Alienation
cc_importances<-t(cc_importances)
varnames<-rownames(mad_importances) <-rownames(cc_importances)<-names(X)
#
varnames[7]<-"position"
#varnames[3]<-"int_repu"
varnames[1]<-"pref_foot"
varnames[15]<-"clause_eur"
varnames[6]<-"club"
#
colnames(cc_importances)<-colnames(mad_importances)<-variants
# Reorder appropriate fields:
mad_importances<-mad_importances[,varreorder]
cc_importances<-cc_importances[,varreorder]
# ARIs<-ARIs[varreorder,,]
# ASWs<-ASWs[varreorder,,]
# Create relative importance
madrel<-sweep(mad_importances,2,colSums(mad_importances),`/`)
rownames(mad_importances)<-rownames(cc_importances)<-varnamesCode for Figure 4
Code
max_y= max(mad_importances)
min_y= 0
max_y_rel= max(madrel)
min_y_rel= 0
min_x=0
max_x=p
max_y_cc=max(cc_importances)
rect_data = tibble(
xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y,
xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y)
rect_data_rel = tibble(
xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y_rel,
xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y_rel)
mad_imp_plot = mad_importances |> as_tibble() |>
mutate(variable_names=rownames(mad_importances),
var_number=1:nrow(mad_importances)) |>
pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |>
ggplot(aes(x=var_number,y=value,color=variant))+
geom_rect(data=rect_data,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line()+
ylab("absolute variable contribution to distance") +
xlab("left-out variable") + theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
theme(axis.text.x = element_text(angle = 60, hjust = 1),
legend.position = "none")
madrel_plot = madrel |> as_tibble() |>
mutate(variable_names=rownames(mad_importances),
var_number=1:nrow(mad_importances)) |>
pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |>
ggplot(aes(x=var_number,y=value,color=variant))+
geom_rect(data=rect_data_rel,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_rel,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line()+
ylab("relative variable contribution to distance") +
xlab("left-out variable") + theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
fifa_mad = mad_imp_plot | madrel_plot
print(fifa_mad, width = 10, height = 5)Code for Figure 5
Code
max_y_cc=max(cc_importances)
rect_data_cc = tibble(
xmin_cont=7.5,xmax_cont=15,ymin_cont=0,ymax_cont=max_y_cc,
xmin_cat=.5,xmax_cat=7.5,ymin_cat=0,ymax_cat=max_y_cc)
cc_imp_plot = cc_importances |> as_tibble() |>
mutate(variable_names=rownames(cc_importances),
var_number=1:nrow(cc_importances)) |>
pivot_longer(cols=Naive:Udep,names_to="variant",values_to="value") |>
ggplot(aes(x=var_number,y=value,color=variant))+
geom_rect(data=rect_data_cc,aes(xmin=xmin_cont,xmax=xmax_cont,ymin=ymin_cont,ymax=ymax_cont),
alpha = .2,fill="indianred",color="lightgrey",inherit.aes = FALSE)+
geom_rect(data=rect_data_cc,aes(xmin=xmin_cat,xmax=xmax_cat,ymin=ymin_cat,ymax=ymax_cat),
alpha = .2,fill="dodgerblue",color="lightgrey",inherit.aes = FALSE)+
geom_point()+geom_line()+
ylab("alienation coefficient") +
xlab("left-out variable") + theme_bw() + scale_x_continuous(breaks=seq(1,p,1),labels = varnames)+
theme(axis.text.x = element_text(angle = 60, hjust = 1))
print(cc_imp_plot,width = 8, height = 4)Appendix D Categorical mean distances: skewed distributions
Data generation
Code
# Let's consider some specific unbalanced scenarios
# One probility is d. The others are (1-d)/(q-1)
# We consider skewed options:
dis<-c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)
#qi=5
qs<-c(2,3,5,10)#,18) # Some options for qi
nint<-length(dis) # Number of options for d (nint-1)
rsm<-rsma<-rhl<-sdsm<-rse<-rsof<-rsiof<-sdse<-rsa<-sdsa<-sdsof<-sdsiof<-sddsma<-sdssm<-
sddse<-sddsa<-sddsm<-sdhl<-sddsmca<-sddsof<-sddsiof<-sddshl<- sdshl<-
rsmca<-sdsmca<-esm<-em<-ee<-ea<-ehl<-emca<-eof<-eiof<-matrix(NA,nrow=nint,ncol=length(qs))
#par(mfrow=c(2,2))
for (qis in 1:length(qs)){
#ds<-NULL
qi<-qs[qis]
for (i in (1:nint)){ # variant 2
Deltam<-matrix(1,qi,qi) # Matching
diag(Deltam)<-rep(0,qi) # Matching Delta
#d=i/(2*qi) # i=2 corresponds to equal probs if variant 1 is chosen. (Only works for one qi. Not for qs loop)
d=dis[i] # variant 2
#ds<-rbind(ds,d)
p<-c(d,rep((1-d)/(qi-1),(qi-1)))
sum(p)
Dp<-diag(p)
pp<-p %*% t(p)
spi<-(diag(Dp-pp))^-1
spi5<-(diag(Dp-pp))^-.5
DEsk<-(2/(qi^2))*Deltam # Delta Eskin
DOF<-log(p)%*%t(log(p)) * Deltam # OF
n=160 # For inv OF we need n. Soon gets very large (increases in n) 160 is sufficient
#n<-ceiling(1/min(p))
DIOF<-log(n*p)%*%t(log(n*p)) * Deltam # inverse OF. If some np<1 we can get negatives
n=160 # For HL we need to find a way to get somewhat consistent values. Let's choose n large
marginals=p*n
HLemp<-distancefactor(cat=qi,n=n,catsizes=marginals) #
DHL<-Deltam*HLemp*2#*sqrt(2) # NOTE 13/09: Corrected: Was sqrt(2). But, should be, see paper, 2x
#DE<-sqrt((1/qi)*(spi %*% t(rep(1,qi)) + t(spi %*% t(rep(1,qi)))))
#diag(DE)<-rep(0,qi)
DM<-sqrt(1/qi)*(spi5 %*% t(rep(1,qi)) + t(spi5 %*% t(rep(1,qi))))
diag(DM)<-rep(0,qi)
DA<-(1/qi)*diag(spi5)%*%Deltam%*%diag(spi5)
DMCA<-(1/qi) * diag(p^-.5)%*%Deltam%*%diag(p^-.5)
#we<-ee[i,(qis)]<- t(p)%*%DE%*%p
wsm<-esm[i,(qis)]<- t(p)%*%Deltam%*%p # Simple Matching
we<-ee[i,(qis)]<- t(p)%*%DEsk%*%p # Eskin
wof<-eof[i,(qis)]<-t(p)%*%DOF%*%p # OF
wiof<-eiof[i,(qis)]<-t(p)%*%DIOF%*%p # IOF
whl<-ehl[i,(qis)]<-t(p)%*%DHL%*%p # HL
wm<-em[i,(qis)]<- t(p)%*%DM%*%p # SD scaling
wa<-ea[i,(qis)]<- t(p)%*%DA%*%p # Cat dis scaling
wmca<-emca[i,(qis)]<- t(p)%*%DMCA%*%p # MCA scaling
# We can also estimate the variance among the distances using Deltas: E(X^2)-E(X)^2
# We can do this dividing by expected values so that we look at what happens
# when delta is commensurable. Or not
# if not:
#we<-wm<-wa<-wmca<-wof<-wiof<-whl<-wsm<-1
sddse[i,(qis)]<- sqrt(t(p)%*%DEsk^2%*%p - ee[i,(qis)]^2)/we # Eskin
sddsma[i,(qis)]<- sqrt(t(p)%*%Deltam^2%*%p - esm[i,(qis)]^2)/ wsm # Simple Matching
sddshl[i,(qis)]<- sqrt(t(p)%*%DHL^2%*%p - ehl[i,(qis)]^2)/ whl # HL
sddsm[i,(qis)]<- sqrt(t(p)%*%DM^2%*%p - em[i,(qis)]^2)/ wm # SD
sddsa[i,(qis)]<- sqrt(t(p)%*%DA^2%*%p - ea[i,(qis)]^2)/wa # Cat dissim
sddsmca[i,(qis)]<- sqrt(t(p)%*%DMCA^2%*%p - emca[i,(qis)]^2)/wmca # MCA
sddsof[i,(qis)]<-sqrt(t(p)%*%DOF^2%*%p - eof[i,(qis)]^2)/wof # OF
sddsiof[i,(qis)]<-sqrt(t(p)%*%(DIOF/as.numeric(wiof))^2%*%p - 1 ) #IOF
# Let's consider spread among cat dissimilarities. But: This only makes sense
# when on similar scales. So, use commensurable deltas: (Or not: see above)
sdse[i,qis]<-sd((1/we)*DEsk[upper.tri(DEsk)]) # Eskin
sdsm[i,qis]<-sd((1/wm)*DM[upper.tri(DM)]) # SD
sdshl[i,qis]<-sd((1/whl)*DHL[upper.tri(DHL)]) # HL
sdssm[i,qis]<-sd((1/wsm)*Deltam[upper.tri(Deltam)]) # Simple Matching
sdsa[i,qis]<-sd((1/wa)*DA[upper.tri(DA)]) # Cat dissim,
sdsmca[i,qis]<-sd((1/wmca)*DMCA[upper.tri(DMCA)]) # MCA
sdsof[i,qis]<-sd((1/wof)*DOF[upper.tri(DOF)]) # OF
sdsiof[i,qis]<-sd((1/wiof)*DIOF[upper.tri(DIOF)]) # IOF
}
}
cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)Code for Figure 6
Code
sk_distance_to_plot = function(exp_mean_dist,distance_name){
exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
rename(`2 cats` = V1,
`3 cats` = V2,
`5 cats` = V3,
`10 cats` = V4) |>
mutate(method=distance_name) |>
pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
names_to = "Number of categories", values_to = "Mean distance")
return(exp_mean_dist)
}
all_exp_mean_dist=rbind(sk_distance_to_plot(esm,"Simple Matching"),
sk_distance_to_plot(ee,"Eskin"),
sk_distance_to_plot(eof,"OF"),
sk_distance_to_plot(eiof,"IOF")
)
paper_plot_skew1=all_exp_mean_dist |>
mutate(`Number of categories`=fct_inorder(`Number of categories`),
method=fct_inorder(method),) |>
ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
geom_line() + geom_point() + facet_wrap(~method,scales = "free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")
print(paper_plot_skew1,width=10,height=6)Code for Figure 7
Code
all_exp_mean_dist2=rbind(sk_distance_to_plot(ehl,"Hennig-Liao"),
sk_distance_to_plot(em,"Std. dev."),
sk_distance_to_plot(ea,"Cat. dissim.")
)
paper_plot_skew2=all_exp_mean_dist2 |>
mutate(`Number of categories`=fct_inorder(`Number of categories`),
method=fct_inorder(method),) |>
ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
geom_line() + geom_point() + facet_wrap(~method) + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")
print(paper_plot_skew2)Code for Figure 8
Code
qs<-c(2,3,5,10)#,18) # Some options for qs
nint<-length(dis) # Number of options for d (nint-1)
CVchd<-CVtvd<-CVmsd<-CVlhd<-Echi<-Echd<-Etvi<-Etvd<-Emsi<-Elhi<-Elhd<-Emsd<-array(NA,dim=c(length(qs),length(qs),nint,nint))
for (q1 in 1:length(qs)){ # cats row
qi<-qs[q1]
for (q2 in q1:length(qs)){ # cats cols
qj<-qs[q2]
for (pi in 1:nint){ # row distributions
d1<-dis[pi]
p1<-c(d1,rep((1-d1)/(qj-1),(qj-1))) # nint different marginals: Distribution over colums
for (pj in 1:nint){
d2<-dis[pj]
p2<-c(d2,rep((1-d2)/(qi-1),(qi-1))) # corresponding col marginals: Distribution over rows
Pi<-p2 %*% t(p1) # joint table independent
Ri<-Pi/rowSums(Pi) # conditional table independent
# Ci<-Ri/sqrt(p2)
if (qj>qi) {
Pd<-diag(p1[1:qi])
Addzeros<-matrix(0,(qi-1),(qj-qi))
Addps<-p1[qi+1:(qj-qi)]
Add<-rbind(Addzeros,Addps)
Pd<-cbind(Pd,Add)
} else { #qi=qj
pd<-p2
Pd<-diag(pd) # joint table dependent case (diagonal smallest q)
}
Rd<-Pd/rowSums(Pd)
Cd<-Rd/sqrt(p2) # Chi-square table: cond. prob/sqrt(p)
# category dissimilarities:
Dlhd<-LeHo(Rd) # Le Ho
# Dmsd<-MousaviSehhati(Pd) # Moussavi
Dtvd<-dist(Rd,"manhattan")/2 # TVD
Dchd<-dist(Cd,"euclidean")^2 # Chi2 dist
# Dchd<-Dchd/(2*(qi-1)) # Chi2 dist corrected for number of categories...
Elhd[q1,q2,pi,pj]<-t(pd) %*%Dlhd %*%pd
# Emsd[q1,q2,pi,pj]<-t(pd) %*%Dmsd %*%pd
Etvd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dtvd) %*%pd
Echd[q1,q2,pi,pj]<-t(pd) %*%as.matrix(Dchd) %*%pd
# Coefficient of variation
CVlhd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dlhd^2 %*%pd - (t(pd) %*%Dlhd %*%pd)^2)/(t(pd) %*%Dlhd %*%pd)
# CVmsd[q1,q2,pi,pj]<-sqrt(t(pd) %*%Dmsd^2 %*%pd - (t(pd) %*%Dmsd %*%pd)^2)/(t(pd) %*%Dmsd %*%pd)
CVtvd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dtvd))^2 %*%pd - (t(pd) %*%as.matrix(Dtvd) %*%pd)^2)/(t(pd) %*%as.matrix(Dtvd) %*%pd)
CVchd[q1,q2,pi,pj]<-sqrt(t(pd) %*%(as.matrix(Dchd))^2 %*%pd - (t(pd) %*%as.matrix(Dchd) %*%pd)^2)/(t(pd) %*%as.matrix(Dchd) %*%pd)
Dlhi<-LeHo(Ri)
# Dmsi<-MousaviSehhati(Pi)
Dtvi<-dist(Ri,"manhattan")/2
# Dchi<-dist(Ci,"euclidean")^2
Elhi[q1,q2,pi,pj]<-t(p2) %*%Dlhi %*%p2
# Emsi[q1,q2,pi,pj]<-t(p2) %*%Dmsi %*%p2
Etvi[q1,q2,pi,pj]<-t(p2) %*%as.matrix(Dtvi) %*%p2
# Echi[q1,q2,pi,pj]<-t(p1) %*%as.matrix(Dchi) %*%p1
}
}
}
}
Elhs<-as.data.frame(rbind(Elhd[1,1,1,],Elhd[2,2,1,],Elhd[3,3,1,],Elhd[4,4,1,]))
Etvs<-as.data.frame(rbind(Etvd[1,1,1,],Etvd[2,2,1,],Etvd[3,3,1,],Etvd[4,4,1,]))
Emss<-as.data.frame(rbind(Emsd[1,1,1,],Emsd[2,2,1,],Emsd[3,3,1,],Emsd[4,4,1,]))
Echs<-as.data.frame(rbind(Echd[1,1,1,],Echd[2,2,1,],Echd[3,3,1,],Echd[4,4,1,]))
CVlhs<-as.data.frame(rbind(CVlhd[1,1,1,],CVlhd[2,2,1,],CVlhd[3,3,1,],CVlhd[4,4,1,]))
CVtvs<-as.data.frame(rbind(CVtvd[1,1,1,],CVtvd[2,2,1,],CVtvd[3,3,1,],CVtvd[4,4,1,]))
CVmss<-as.data.frame(rbind(CVmsd[1,1,1,],CVmsd[2,2,1,],CVmsd[3,3,1,],CVmsd[4,4,1,]))
CVchs<-as.data.frame(rbind(CVchd[1,1,1,],CVchd[2,2,1,],CVchd[3,3,1,],CVchd[4,4,1,]))
cats<-as.character(qs)
skew<-as.character(dis)#as.character(1:nint)
sk_distance_to_plot = function(exp_mean_dist,distance_name){
exp_mean_dist = as_tibble(exp_mean_dist) |> mutate(`skewness (p1)` = c(0.05,0.1,0.2,0.33,0.5,0.66,0.8,0.9,0.95)) |>
rename(`2 cats` = V1,
`3 cats` = V2,
`5 cats` = V3,
`10 cats` = V4) |>
mutate(method=distance_name) |>
pivot_longer(cols = c(`2 cats`, `3 cats`, `5 cats`, `10 cats`),
names_to = "Number of categories", values_to = "Mean distance")
return(exp_mean_dist)
}
all_exp_mean_dist3=rbind(sk_distance_to_plot(t(as.matrix(Elhs)),"Le-Ho"),
sk_distance_to_plot(t(as.matrix(Etvs)),"Total Variance Distance")
)
paper_plot_skew3=all_exp_mean_dist3 |>
mutate(`Number of categories`=fct_inorder(`Number of categories`),
method=fct_inorder(method),) |>
ggplot(aes(y=`Mean distance`,x=`skewness (p1)`,color=`Number of categories`)) +
geom_line() + geom_point() + facet_wrap(~method,scales="free_y") + theme_minimal() + labs(x=expression("Skewness (p1)"), y="Mean distance") +
xlim(c(0,1))+expand_limits(y = 0)+theme_bw()+theme(legend.position = "bottom")
print(paper_plot_skew3,width=8,height=4)Appendix E FIFA variable distributions
Code for Figure 9
Code
X_tib_num = X |> as_tibble() |> select(where(is.numeric))
X_tib_cat = X |> as_tibble() |> select(where(is.factor))
levels(X_tib_cat$work_rate)=c("H/H", "H/L", "H/M", "L/H", "L/L", "L/M", "M/H", "M/L", "M/M")
levels(X_tib_cat$club_name) = c("ADO","Ajax","AZ","Emmen","Groningen","Twente","Utrecht",
"Feyenoord","Fortuna","Heracles","PEC","PSV","RKC","Heerenveen",
"Sparta","Vitesse","VVV","Willem II")
X_tib_cat=X_tib_cat |> rename(`club`=club_name,
`position`=team_position)
X_tib_num_nest = tibble(values=X_tib_num |> map((~.x)),features=names(X_tib_num))|>
mutate(
hplot=map2(.x=values,.y=features,
function(x=.x,y=.y){
my_tib=tibble(a=x)
names(my_tib) = y
my_plot= my_tib |> ggplot(aes(x)) + geom_histogram(bins=15,alpha=.75)+theme_bw()+ggtitle(y)+
theme(plot.title = element_text(hjust = 0.5))+xlab("")+ylab("")
}
)
)
X_tib_cat_nest = tibble(values=X_tib_cat |> map((~.x)),features=names(X_tib_cat))|>
mutate(
bplot=map2(.x=values,.y=features,
function(x=.x,y=.y){
my_tib=tibble(a=x)
names(my_tib) = y
my_plot= my_tib |> ggplot(aes(x)) + geom_bar(alpha=.75)+theme_bw()+ggtitle(y)+
theme(plot.title = element_text(hjust = 0.5),
axis.text.x =element_text(angle=90 ))+xlab("")+ylab("")
}
)
)
fifa_descriptive_plots=(X_tib_cat_nest$bplot[[1]]/
X_tib_cat_nest$bplot[[2]]/
X_tib_cat_nest$bplot[[3]]/
X_tib_cat_nest$bplot[[4]]/
X_tib_cat_nest$bplot[[5]]/
X_tib_cat_nest$bplot[[6]]/
X_tib_cat_nest$bplot[[7]])|
(X_tib_num_nest$hplot[[1]]/
X_tib_num_nest$hplot[[2]]/
X_tib_num_nest$hplot[[3]]/
X_tib_num_nest$hplot[[4]]/
X_tib_num_nest$hplot[[5]]/
X_tib_num_nest$hplot[[6]]/
X_tib_num_nest$hplot[[7]])#+ plot_layout(guides = 'collect')
print(fifa_descriptive_plots)